In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, KFold, cross_val_predict, GridSearchCV
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import preprocessing, decomposition
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE, SelectKBest
from sklearn.preprocessing import StandardScaler
import time
In [2]:
artworks = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')
artworks.columns
Out[2]:
In [3]:
# Select Columns.
artworks = artworks[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']]
# Convert URL's to booleans.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()
# Drop films and some other tricky rows.
artworks = artworks[artworks['Department']!='Film']
artworks = artworks[artworks['Department']!='Media and Performance Art']
artworks = artworks[artworks['Department']!='Fluxus Collection']
# Drop missing data.
artworks = artworks.dropna()
In [4]:
# Collapse multiple nationalities, genders, and artists into single labels.
artworks.loc[artworks['Gender'].str.contains(r'\) \('), 'Gender'] = '(multiple_persons)'
artworks.loc[artworks['Nationality'].str.contains(r'\) \('), 'Nationality'] = '(multiple_nationalities)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'
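For context, a minimal sketch (using made-up example strings, not actual rows from the dataset) of how the ') (' pattern flags entries that contain more than one parenthesized value:
In [ ]:
# Hypothetical strings: joint works concatenate several parenthesized values
demo = pd.Series(['(Male)', '(Male) (Female)', '(Female)'])
demo.str.contains(r'\) \(')  # True only for the multi-entry string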
In [5]:
artworks.Nationality.unique()
Out[5]:
In [6]:
artworks1 = artworks.loc[~artworks['Nationality'].isin(
    ['()', '(nationality unknown)', '(Nationality unknown)', '(Nationality Unknown)'])].copy()
artworks1['Nationality'].replace('(multiple_nationalities)', '(Multiple Nationalities)', inplace=True)
In [7]:
artworks1['Nationality'] = artworks1['Nationality'].astype(str).str.replace(r'[()]', '', regex=True)
In [8]:
artworks1.Nationality.unique()
Out[8]:
In [9]:
artworks1 = artworks1.loc[~(artworks1['Gender'] == '()')]
artworks1['Gender'].replace('(male)', '(Male)', inplace=True)
artworks1['Gender'].replace('(multiple_persons)', '(Various Painters)', inplace=True)
artworks1['Gender'] = artworks1['Gender'].astype(str).str.replace(r'[()]', '', regex=True)
In [10]:
artworks1.Gender.unique()
Out[10]:
In [11]:
artworks1['DateAcquired'] = pd.to_datetime(artworks1.DateAcquired)
artworks1['YearAcquired'] = artworks1.DateAcquired.dt.year
artworks1['YearAcquired'].dtype
Out[11]:
In [12]:
# Reduce each date to its first four-digit year, cutting down the number of distinct values.
artworks1['Date'] = artworks1.Date.str.extract('([0-9]{4})', expand=False)
# Final column drops before building the feature matrix.
X = artworks1.drop(['Department', 'DateAcquired', 'Artist', 'Nationality',
                    'Date', 'URL', 'ThumbnailURL'], axis=1)
# Create dummies separately (the artist dummies are not concatenated below).
artists = pd.get_dummies(artworks1.Artist)
nationalities = pd.get_dummies(artworks1.Nationality)
dates = pd.get_dummies(artworks1.Date)
# Concatenate the nationality and date dummies with the other variables.
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)
Y = artworks1.Department
In [13]:
# Standardize the features (zero mean, unit variance)
names = X.columns
X_scaled = pd.DataFrame(preprocessing.scale(X), columns=names)
In [14]:
# PCA analysis
# Build the correlation matrix
correlation_matrix = X_scaled.corr()
# Calculate the eigenvalues & eigenvectors of the (symmetric) correlation matrix
eig_vals, eig_vecs = np.linalg.eigh(correlation_matrix)
eig_vals = eig_vals[::-1]  # eigh returns ascending order; flip to descending
# Fit PCA on the standardized features themselves, not on the correlation matrix
sklearn_pca = PCA(n_components=len(X_scaled.columns))
Y_sklearn = sklearn_pca.fit_transform(X_scaled)
# Scree plot for visual analysis of the principal components
plt.title('Scree Plot')
plt.plot(eig_vals)
plt.show()
# For additional aid, print the share of total variance explained by each component
print('The percentage of total variance in the dataset explained:\n',
      sklearn_pca.explained_variance_ratio_)
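As a quick follow-up (a minimal sketch, assuming sklearn_pca has been fit as above), the cumulative explained-variance ratio makes it easier to read off how many components cover a given share of the variance:
In [ ]:
# Cumulative share of variance explained by the first k components
cumulative = np.cumsum(sklearn_pca.explained_variance_ratio_)
plt.title('Cumulative Explained Variance')
plt.plot(cumulative)
plt.axhline(0.9, color='r', linestyle='--')  # illustrative 90% threshold
plt.show()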
In [15]:
# Calculate feature importance using a Random Forest
# Initialize and fit the Random Forest classifier
rf = RandomForestClassifier()
rf.fit(X_scaled, Y)
# Extract the feature importances
feature_importance = rf.feature_importances_
# Make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
#Plot the relative importance of each feature
plt.figure(figsize=(7, 20))
plt.subplot(1, 1, 1)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, X.columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Feature Selection (Random Forest)')
plt.show()
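As a compact complement to the bar chart (a minimal sketch reusing the feature_importance array from above), the same importances can be listed numerically:
In [ ]:
# Ten most important features by relative importance
pd.Series(feature_importance, index=X.columns).nlargest(10)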
In [16]:
# Feature selection using SelectKBest
# Scores rank the features, starting with the one with the most explanatory power
# Initialize and fit the model for feature scoring
test = SelectKBest()
fit = test.fit(X_scaled, Y)
# Identify the features with the highest predictive scores
# Create a dataframe with the features ordered by their explanatory power
features_names = X_scaled.columns
Bestfeatures = pd.DataFrame(fit.scores_, index = features_names)
Bestfeatures.columns = ['Best Features']
Bestfeatures.sort_values(by=['Best Features'], ascending=False).head(30)
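Note that SelectKBest() with no arguments scores features with the ANOVA F-statistic (f_classif) and keeps k=10; a sketch of the equivalent explicit call:
In [ ]:
from sklearn.feature_selection import f_classif
# Equivalent to SelectKBest() above, with the defaults spelled out
test_explicit = SelectKBest(score_func=f_classif, k=10)
fit_explicit = test_explicit.fit(X_scaled, Y)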
Out[16]:
In [17]:
# Feature selection with the Recursive Feature Elimination (RFE) model
# Set the number of features to keep, as indicated by the PCA analysis: 6
n_features = 6
# Initialize the model and fit
lr = LogisticRegression()
rfe = RFE(lr, n_features_to_select=n_features)
fit = rfe.fit(X_scaled, Y)
# Summarize the feature selection: show each feature's ranking and
# whether it was selected (True) or left out (False)
result_RFE = pd.DataFrame(list(zip(X_scaled.columns, rfe.ranking_, rfe.support_)),
                          columns=['Features', 'Ranking', 'Support'])
result_RFE.sort_values('Ranking').head(30)
Out[17]:
In [21]:
X_selected = X_scaled[['Height (cm)', 'Width (cm)', 'French','American',
'Gender_Various Painters','Multiple Nationalities',
'YearAcquired','2003', 'Swiss', 'Spanish', '1971', '1926', 'Portuguese',
'1860', 'Dutch', 'Gender_Female', '1940', '1869', '2002',
'1857', '2004', '1843', '1873', '1914', 'Belgian', '1899', '1875']]
In [22]:
# Split the data into training and testing sets: 70/30 train/test
X_train, X_test, y_train, y_test = train_test_split(X_selected, Y, test_size=0.3, random_state=123)
# Initialize the cross-validation generator, n_splits = 5
kf = KFold(5)
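A minimal sketch of how the kf generator defined above can be passed as the cv argument so every model is scored on the same five splits (shown with a plain LogisticRegression as an illustrative baseline, not a model used in the analysis below):
In [ ]:
# Illustrative baseline: score a simple classifier on the same 5 folds
baseline = LogisticRegression()
cross_val_score(baseline, X_train, y_train, cv=kf).mean()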
In [36]:
# Initialize the model.
mlp = MLPClassifier(max_iter=500, tol=0.001)
# Tune hyperparameters
# Create the grid of parameter values to search over
hidden_layer_sizes = [(100, 20), (200, 50), (400, 100)]
activation = ['logistic']
learning_rate_init = [0.001, 0.01, 1]
parameters = {'hidden_layer_sizes': hidden_layer_sizes,
              'activation': activation,
              'learning_rate_init': learning_rate_init}
# Search the grid with 3-fold cross-validation
mlp_tuned = GridSearchCV(mlp, param_grid=parameters, cv=3)
# Fit the tuned classifier on the training data
mlp_tuned.fit(X_train, y_train)
# Print the best parameters
print(mlp_tuned.best_params_)
# Print the training-set score of the tuned model
print(mlp_tuned.score(X_train, y_train))
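Beyond the single best score, GridSearchCV exposes a cv_results_ attribute; a small sketch for inspecting the mean cross-validated score of every parameter combination tried:
In [ ]:
# Mean cross-validated score for each parameter combination
pd.DataFrame(mlp_tuned.cv_results_)[['params', 'mean_test_score']].sort_values(
    'mean_test_score', ascending=False)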
In [37]:
# Predict on the test set with the model fit on the training data
predtest_y = mlp_tuned.predict(X_test)
In [38]:
cross_val_score(mlp_tuned, X_test, y_test, cv=3).mean()
Out[38]:
In [39]:
Y.value_counts()/len(Y)
Out[39]:
In the artwork dataset, a multi-layer perceptron has been run on the selected features. The features were selected using Random Forest, SelectKBest, and RFE.
The final features used are:
'Height (cm)', 'Width (cm)', 'French', 'American', 'Gender_Various Painters', 'Multiple Nationalities', 'YearAcquired', '2003', 'Swiss', 'Spanish', '1971', '1926', 'Portuguese', '1860', 'Dutch', 'Gender_Female', '1940', '1869', '2002', '1857', '2004', '1843', '1873', '1914', 'Belgian', '1899', '1875'
The overall cross-validation accuracy increased from an initial 55% to 67% by using two hidden layers with 400 and 100 neurons. Furthermore, learning_rate_init was set to 0.001 (from 1e-4), and the activation function is logistic. All parameters were chosen with GridSearchCV using 3 folds.
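To go beyond a single accuracy number, the classification_report and confusion_matrix helpers imported at the top (but not used above) could break the test-set performance down by department; a minimal sketch using the held-out predictions predtest_y:
In [ ]:
# Per-class precision, recall and F1 on the held-out test set
print(classification_report(y_test, predtest_y))
print(confusion_matrix(y_test, predtest_y))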